Load data
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.0 ──
## ✔ broom 1.0.5 ✔ rsample 1.1.1
## ✔ dials 1.2.0 ✔ tibble 3.2.1
## ✔ dplyr 1.1.2 ✔ tidyr 1.3.0
## ✔ infer 1.0.4 ✔ tune 1.1.1
## ✔ modeldata 1.1.0 ✔ workflows 1.1.3
## ✔ parsnip 1.1.0 ✔ workflowsets 1.0.1
## ✔ purrr 1.0.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.6
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# Values of conditions to test
flow.rate <- 0 # 0 0.6 3 5.9 8.9
chlorophyll <- 0 # 0 4.3 4.6 5.5 6.1 7.6 13.5 19
guano <- 1 # Absent=1 Present=2
light <- 1 # Absent=1 Present=2
# Vectors of all possible conditions combinations
frs <- as.numeric(as.character(unique(CC.TotalData$Flow.rate)))
chls <- as.numeric(as.character(unique(CC.TotalData$Chlorophyll)))
guans <- c(1,2)
lights <- c(1,2)
conditions <- expand.grid(frs,chls,guans,lights)
Autocorrelation looking at linear regression and residuals. Note that residuals are not normally distributed.
velocity <- CC.TotalData$v[
(CC.TotalData$Flow.rate==flow.rate &
CC.TotalData$Chlorophyll==chlorophyll &
as.numeric(CC.TotalData$Guano)==guano &
as.numeric(CC.TotalData$Light)==light)]
x <- log10(velocity[1:length(velocity)-1])
y <- log10(velocity[2:length(velocity)])
vel.auto.lm <- lm(x ~ y + 1)
summary(vel.auto.lm)
##
## Call:
## lm(formula = x ~ y + 1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.1940 -0.2416 0.0330 0.2671 2.9707
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.742378 0.002666 -278.5 <2e-16 ***
## y 0.500275 0.001704 293.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.424 on 258075 degrees of freedom
## Multiple R-squared: 0.2503, Adjusted R-squared: 0.2503
## F-statistic: 8.615e+04 on 1 and 258075 DF, p-value: < 2.2e-16
plot(hexbin(x,
y,
xbins = 100),
xlab='Log velocity (m/s)',
ylab='Log velocity (m/s) next time step',
legend = 0)
hist(log10(velocity),100)
hist(vel.auto.lm$residuals,100)
qqnorm(vel.auto.lm$residuals,main="Normal Q-Q plot")
qqline(vel.auto.lm$residuals)
ks.test(vel.auto.lm$residuals,'pnorm')
## Warning in ks.test.default(vel.auto.lm$residuals, "pnorm"): ties should not be
## present for the Kolmogorov-Smirnov test
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: vel.auto.lm$residuals
## D = 0.21517, p-value < 2.2e-16
## alternative hypothesis: two-sided
Loop through combinations
for (i in 1:dim(conditions)[1])
{
velocity <- CC.TotalData$v[
(CC.TotalData$Flow.rate==conditions[i,1] &
CC.TotalData$Chlorophyll==conditions[i,2] &
as.numeric(CC.TotalData$Guano)==conditions[i,3] &
as.numeric(CC.TotalData$Light)==conditions[i,4])]
if (length(velocity) <= 1) {
conditions[i,5] <- NA } else {
c<-cor.test(log10(velocity[1:length(velocity)-1]),
log10(velocity[2:length(velocity)]))
conditions[i,5] <- c$estimate
v0 <- log10(velocity[1:length(velocity)-1])
v1 <- log10(velocity[2:length(velocity)])
df <- data.frame(v0,v1)
fit <- lm(v1 ~ v0 + 1, data = df)
conditions[i,6] <- fit$coefficients[1]
conditions[i,7] <- fit$coefficients[2]
d <- dip.test(velocity)
conditions[i,8] <- d$p.value
hist(log10(velocity),100,
main=d$p.value)
}
}
## n = 258078 > max_n{n in table} = 72000 -- using that as asymptotic value.
## n = 116154 > max_n{n in table} = 72000 -- using that as asymptotic value.
## n = 266817 > max_n{n in table} = 72000 -- using that as asymptotic value.
## n = 211577 > max_n{n in table} = 72000 -- using that as asymptotic value.
## n = 253471 > max_n{n in table} = 72000 -- using that as asymptotic value.
## n = 192727 > max_n{n in table} = 72000 -- using that as asymptotic value.
## n = 271850 > max_n{n in table} = 72000 -- using that as asymptotic value.
## n = 91500 > max_n{n in table} = 72000 -- using that as asymptotic value.
## n = 137251 > max_n{n in table} = 72000 -- using that as asymptotic value.
## n = 282203 > max_n{n in table} = 72000 -- using that as asymptotic value.
## n = 140189 > max_n{n in table} = 72000 -- using that as asymptotic value.
## n = 142013 > max_n{n in table} = 72000 -- using that as asymptotic value.
conditions <- setNames(conditions,c("flow.rate","chlorophyll","guano","light","corr.val","slope","intercept","dip.test"))
Using random forest to fit the missing values
Idata <- which(!is.na(conditions[,5])) # Index of data
Iblank <- which(is.na(conditions[,5])) # Index of blank data
conditions_data <- conditions[Idata,]
conditions_blank <- conditions[Iblank,]
# Tidy Models version
#data_split <- initial_split(conditions_data, prop = 0.6)
#data_test <- testing(data_split)
#data_train <- training(data_split)
#data_folds <- vfold_cv(data_train,
# v = 5, # number of partitions in dataset
# repeats = 1)
#mc_split <- mc_cv(data_train,
# prop = 9/10, # proportion to use per resample
# times = 20) # number of resamples
#template_data <- training(data_split)
# Random forest package version - doing dip test
conditions.rf <- randomForest(dip.test ~ .,
data=select(conditions_data,c(1,2,3,4,8)),
importance=TRUE,
proximity=TRUE)
#print(conditions.rf)
#round(importance(conditions.rf), 2)
#sqrt(conditions.rf$mse[which.min(conditions.rf$mse)])
plot(conditions.rf)
varImpPlot(conditions.rf)
conditions_blank$dip.test <- predict(conditions.rf,conditions_blank[,1:4])
fit.dip.test <- predict(conditions.rf,conditions_data[,1:4])
plot(fit.dip.test,conditions_data$dip.test)
plot(conditions_blank$chlorophyll,conditions_blank$dip.test)
plot(conditions_blank$flow.rate,conditions_blank$dip.test)
plot(conditions_blank$light,conditions_blank$dip.test)
plot(conditions_blank$guano,conditions_blank$dip.test)
Random forest test for corr
conditions.rf <- randomForest(corr.val ~ .,
data=select(conditions_data,c(1,2,3,4,5)),
importance=TRUE,
proximity=TRUE)
#print(conditions.rf)
#round(importance(conditions.rf), 2)
#sqrt(conditions.rf$mse[which.min(conditions.rf$mse)])
plot(conditions.rf)
varImpPlot(conditions.rf)
conditions_blank$corr.val <- predict(conditions.rf,conditions_blank[,1:4])
fit.corr.val <- predict(conditions.rf,conditions_data[,1:4])
plot(fit.corr.val,conditions_data$corr.val)
plot(conditions_blank$chlorophyll,conditions_blank$corr.val)
plot(conditions_blank$flow.rate,conditions_blank$corr.val)
plot(conditions_blank$light,conditions_blank$corr.val)
plot(conditions_blank$guano,conditions_blank$corr.val)